{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Tutorial for Paribas kaggle competition\n",
    "The data for this tutorial can be obtained from this page https://www.kaggle.com/c/bnp-paribas-cardif-claims-management/data (you would have to register a kaggle account or just login with facebook or google+)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "from itertools import combinations\n",
    "from catboost import CatBoostClassifier"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "train_df = pd.read_csv('../input/train.csv')\n",
    "test_df  = pd.read_csv('../input/test.csv')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## § Base approach"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "With this base approach we were able to achieve **22d** place on the leaderboard, while doing no feature engineering at all: only few basic steps to prepare data and feed it into CatBoost model"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Prepare data:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "labels = train_df.target\n",
    "test_id = test_df.ID\n",
    "\n",
    "train_df.drop(['ID', 'target'], axis=1, inplace=True)\n",
    "test_df.drop(['ID'], axis=1, inplace=True)\n",
    "\n",
    "train_df.fillna(-9999, inplace=True)\n",
    "test_df.fillna(-9999, inplace=True)\n",
    "\n",
    "# Keep list of all categorical features in dataset to specify this for CatBoost\n",
    "cat_features_ids = np.where(train_df.apply(pd.Series.nunique) < 30000)[0].tolist()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Train the model:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<catboost._CatBoostBase at 0x7fe2b4fe7dd0>"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "clf = CatBoostClassifier(learning_rate=0.1, iterations=1000, random_seed=0)\n",
    "clf.fit(train_df, labels, cat_features=cat_features_ids)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Make submission:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "prediction = clf.predict_proba(test_df)[:,1]\n",
    "\n",
    "pd.DataFrame(\n",
    "    {'ID':test_id, 'PredictedProb':prediction}\n",
    ").to_csv(\n",
    "    'submission_base.csv', index=False\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## § Improved approach"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now by adding few more steps of data manipulation and feature engineering we have acheived **11th** place on the leaderboard"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "selected_features = [\n",
    "    'v10', 'v12', 'v14', 'v21', 'v22', 'v24', 'v30', 'v31', 'v34', 'v38', 'v40', 'v47', 'v50',\n",
    "    'v52', 'v56', 'v62', 'v66', 'v72', 'v75', 'v79', 'v91', 'v112', 'v113', 'v114', 'v129'\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# drop some of the features that were not selected\n",
    "train_df = train_df[selected_features]\n",
    "test_df = test_df[selected_features]\n",
    "\n",
    "# update the list of categorical features\n",
    "cat_features_ids = np.where(train_df.apply(pd.Series.nunique) < 30000)[0].tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "char_features = list(train_df.columns[train_df.dtypes == np.object])\n",
    "char_features_without_v22 = list(train_df.columns[(train_df.dtypes == np.object) & (train_df.columns != 'v22')])\n",
    "\n",
    "cmbs = list(combinations(char_features, 2)) + map(lambda x: (\"v22\",) + x, combinations(char_features_without_v22, 2))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def concat_columns(df, columns):\n",
    "    value = df[columns[0]].astype(str) + ' '\n",
    "    for col in columns[1:]:\n",
    "        value += df[col].astype(str) + ' '\n",
    "    return value\n",
    "\n",
    "# add new features based on combinations/interactions\n",
    "for cols in cmbs:\n",
    "    train_df[\"\".join(cols)] = concat_columns(train_df, cols)\n",
    "    test_df[\"\".join(cols)] = concat_columns(test_df, cols)\n",
    "\n",
    "# add new engineered features to the list of categorical features in dataframe\n",
    "cat_features_ids += range(len(selected_features), train_df.shape[1])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Train the model:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<catboost._CatBoostBase at 0x7fe2b44462d0>"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "clf = CatBoostClassifier(learning_rate=0.1, iterations=1000, random_seed=0)\n",
    "clf.fit(train_df, labels, cat_features=cat_features_ids)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Make submission:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "prediction = clf.predict_proba(test_df)[:,1]\n",
    "\n",
    "pd.DataFrame(\n",
    "    {'ID':test_id, 'PredictedProb':prediction}\n",
    ").to_csv(\n",
    "    'submission_improved.csv', index=False\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## § Bagging"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Finally by averaging predictions from several models trained with different seed we reduce the variance and are able to achieve **9th** place on the leaderboard"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "predictions = []\n",
    "\n",
    "for i in range(10):\n",
    "    clf = CatBoostClassifier(learning_rate=0.1, iterations=1000, random_seed=i)\n",
    "    clf.fit(train_df, labels, cat_features=cat_features_ids)\n",
    "    predictions.append(clf.predict_proba(test_df)[:,1])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Make submission:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "prediction = np.mean(predictions, axis=0)\n",
    "\n",
    "pd.DataFrame(\n",
    "    {'ID':test_id, 'PredictedProb':prediction}\n",
    ").to_csv(\n",
    "    'submission_improved_bagged.csv', index=False\n",
    ")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
